"""
Created by LiXuefei on 2017-07-26.

"""
import urllib.request
from urllib.request import urlopen
import pandas as pd
# Parse the downloaded page with BeautifulSoup (third-party library).
## Download the page
from bs4 import BeautifulSoup

def get_content(url):
    """Download the page at *url* and return its body decoded as UTF-8.

    A browser-like User-Agent header is sent because douban.com rejects
    requests carrying the default urllib agent string.

    :param url: fully-qualified URL to fetch
    :returns: response body as a ``str`` (UTF-8 decoded)
    :raises urllib.error.URLError: on network failure or timeout
    """
    headers = {'User-Agent': 'mozilla/5.0 (windows nt 6.1; wow64) applewebkit/537.36 (khtml, like gecko) chrome/27.0.1453.94 safari/537.36'}
    req = urllib.request.Request(url=url, headers=headers)
    # Use a context manager so the socket is closed deterministically
    # (the original leaked the response object), and bound the wait.
    with urllib.request.urlopen(req, timeout=30) as resp:
        return resp.read().decode('utf-8')

def get_txt(info):
    """Extract movie names and watch dates from a douban "collect" page.

    :param info: HTML source of one collection page (``str``)
    :returns: tuple ``(info_name, info_time)`` — two parallel lists of
        strings, one entry per movie on the page
    """
    soup = BeautifulSoup(info, "lxml")  # use the "lxml" parser
    name_tags = soup.select(".item > .info ul li a em")
    date_tags = soup.select(".item > .info ul li > .date ")
    # Bug fix: the original used str.strip('<em>' + '</em>' + ...), which
    # strips a *set of characters* from both ends — any title beginning or
    # ending with e, m, <, >, / or ' was silently truncated. Tag.get_text()
    # extracts the element text without that surgery.
    info_name = [tag.get_text(strip=True) for tag in name_tags]
    info_time = [tag.get_text(strip=True) for tag in date_tags]
    return info_name, info_time

def _scrape_collection(pages=25, page_size=15):
    """Fetch *pages* pages of the collection and return (names, dates).

    :param pages: number of listing pages to request (original: 25)
    :param page_size: movies per page used for the ``start`` offset
    :returns: tuple of two parallel lists — movie names and watch dates
    """
    names, dates = [], []
    for page in range(pages):
        start = str(page_size * page)
        url = ("https://movie.douban.com/people/62209085/collect?start="
               + start + "&sort=time&rating=all&filter=all&mode=grid")
        mov_name, mov_time = get_txt(get_content(url))
        # extend() keeps the two lists in lockstep even if one page yields
        # fewer dates than names (the old index loop raised IndexError).
        names.extend(mov_name)
        dates.extend(mov_time)
        print(len(names), len(dates))  # progress indicator, one line per page
    return names, dates


if __name__ == "__main__":
    info, mov_time = _scrape_collection()
    dataset = pd.DataFrame({'info': info, 'time': mov_time})
    dataset.to_csv('/Users/lixuefei/Desktop/movie.csv')

